Part 1

# Load the four input tables from the working directory.
# `show_col_types = FALSE` silences readr's column-specification message.
sb_locs <- read_csv(
  file = "starbucks_locations.csv",
  show_col_types = FALSE
)
sb_nutr <- read_csv(
  file = "starbucks_menu_nutrition.csv",
  show_col_types = FALSE
)
usa_pop <- read_csv(
  file = "us_state_pop.csv",
  show_col_types = FALSE
)
usa_states <- read_csv(
  file = "states.csv",
  show_col_types = FALSE
)

Part 2

# Inspect the class of every column in each table to confirm that the
# import produced sensible types (identifiers as character, measures as
# numeric).
sapply(sb_locs, class)
##          Brand   Store Number     Store Name Ownership Type Street Address 
##    "character"    "character"    "character"    "character"    "character" 
##           City State/Province        Country       Postcode   Phone Number 
##    "character"    "character"    "character"    "character"    "character" 
##       Timezone      Longitude       Latitude 
##    "character"      "numeric"      "numeric"
sapply(sb_nutr, class)
##        Item    Category    Calories     Fat (g)   Carb. (g)   Fiber (g) 
## "character" "character"   "numeric"   "numeric"   "numeric"   "numeric" 
## Protein (g) 
##   "numeric"
sapply(usa_pop, class)
##       state  population 
## "character"   "numeric"
sapply(usa_states, class)
##        State Abbreviation 
##  "character"  "character"
# is.na() on a data frame returns a logical matrix with one entry per
# cell, so mean() of it is the fraction of all cells (not rows) that
# are missing.
mean(is.na(sb_locs))
## [1] 0.02524639
mean(is.na(sb_nutr))
## [1] 0
mean(is.na(usa_pop))
## [1] 0
mean(is.na(usa_states))
## [1] 0

The datasets have been imported correctly and the columns have reasonable types (e.g., store and phone numbers are of type character, while longitude and calories are of type numeric). About 2.5% of the cells in the Starbucks location dataset are missing, while the other datasets have no missing values.

Part 3

# Number of US stores per state. The grouping column is renamed to `state`
# while counting; it is later matched against `Abbreviation`, so it is
# presumably a two-letter state code.
sb_locs_state <- sb_locs |>
  filter(Country == "US") |>
  count(state = `State/Province`, name = "n_stores")

# Attach each state's abbreviation to its population. A full join keeps
# rows that appear in only one of the two tables.
usa_pop_abbr <- usa_pop |>
  full_join(usa_states, by = join_by(state == State))

# Combine population and store counts. States without a match on the other
# side are kept, with NA in the missing columns.
sb_locs_state <- usa_pop_abbr |>
  full_join(sb_locs_state, by = join_by(Abbreviation == state))

summary(sb_locs_state)
##     state             population       Abbreviation          n_stores     
##  Length:55          Min.   :   56882   Length:55          Min.   :   8.0  
##  Class :character   1st Qu.: 1344331   Class :character   1st Qu.:  56.5  
##  Mode  :character   Median : 3751351   Mode  :character   Median : 123.0  
##                     Mean   : 5677621                      Mean   : 266.8  
##                     3rd Qu.: 6515716                      3rd Qu.: 332.0  
##                     Max.   :37253956                      Max.   :2821.0  
##                                                           NA's   :4

Part 4

# Scatter plot of store count against population, one point per state,
# coloured by state abbreviation. Rows with a missing `n_stores` cannot be
# drawn and are dropped by ggplot2 with a warning.
p1 <- ggplot(
  sb_locs_state,
  aes(x = population, y = n_stores, color = Abbreviation)
) +
  geom_point(alpha = 0.8) +
  theme_minimal() +
  labs(x = "Population", y = "Number of stores")

# Histogram of calories, one panel per menu category.
# `bins = 30` is ggplot2's default; stating it explicitly keeps the same
# picture while avoiding the "`stat_bin()` using `bins = 30`" message.
p2 <- sb_nutr |>
  ggplot(aes(x = Calories)) +
  geom_histogram(bins = 30) +
  facet_grid(. ~ Category) +
  labs(y = "Count") +
  theme_minimal()

# Bar chart of the 20 most frequent words in the menu item names.
# unnest_tokens() splits each item name into one row per word.
word_counts_top20 <- sb_nutr |>
  select(Item) |>
  unnest_tokens(word, Item) |>
  count(word, sort = TRUE) |>
  slice_head(n = 20)

# reorder() sorts the bars by frequency; coord_flip() puts the words on
# the vertical axis so the labels stay readable.
p3 <- ggplot(word_counts_top20, aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(y = "Count", x = "Word")

# Convert the static ggplot objects into interactive plotly widgets.
ggplotly(p1)
ggplotly(p2)
ggplotly(p3)

Part 5

# Interactive scatter of carbohydrates against calories for every menu
# item, coloured by category.
carb_vs_calories <- plot_ly(
  data = sb_nutr,
  x = ~Calories,
  y = ~`Carb. (g)`,
  type = "scatter",
  mode = "markers",
  color = ~Category
)
layout(carb_vs_calories, title = "Carbohydrates vs calories by food and drinks")
# The ten most frequent words across all menu item names.
topwords <- sb_nutr |>
  select(Item) |>
  unnest_tokens(word, Item) |>
  count(word, sort = TRUE) |>
  slice_head(n = 10)

# Carbohydrates vs calories, restricted to items whose name contains one of
# the top-10 words. After unnest_tokens() there is one row per (item, word)
# pair, so an item containing several top words is drawn once per word.
# NOTE(review): unnest_tokens() replaces `Item` with `word` by default, so
# the hover text shows the matched word rather than the full item name.
sb_nutr |>
  unnest_tokens(word, Item) |>
  filter(word %in% topwords$word) |>
  plot_ly(
    x = ~Calories, y = ~`Carb. (g)`, type = "scatter",
    mode = "markers", color = ~Category, hoverinfo = "text",
    # paste0() has no `sep` argument; the former `sep = ""` was merely
    # pasted as an extra empty string, so dropping it changes nothing.
    text = ~paste0("Item: ", word)
  ) |>
  layout(
    title = "Carbohydrates vs calories for items with the top 10 words",
    yaxis = list(title = "Carbohydrates (g)"),
    hovermode = "compare"
  )

Part 6

# One row per (menu item, word) pair, keeping only the top-10 words.
filtered_data <- sb_nutr |>
  unnest_tokens(word, Item) |>
  filter(word %in% topwords$word)

# Grouped boxplots: for every top word, one box per nutrition variable.
# All five variables share a single y axis, so calories dominate the scale.
# The trace type is left to add_boxplot(): giving `type = "box"` to
# plot_ly() as well can register an additional box trace that has no y
# values. Each trace is named so that the hover label identifies the
# variable, since the legend is hidden.
filtered_data |>
  plot_ly(x = ~word) |>
  add_boxplot(y = ~Calories, name = "Calories", boxpoints = "all") |>
  add_boxplot(y = ~`Fat (g)`, name = "Fat (g)", boxpoints = "all") |>
  add_boxplot(y = ~`Carb. (g)`, name = "Carb. (g)", boxpoints = "all") |>
  add_boxplot(y = ~`Fiber (g)`, name = "Fiber (g)", boxpoints = "all") |>
  add_boxplot(y = ~`Protein (g)`, name = "Protein (g)", boxpoints = "all") |>
  layout(
    title = "Boxplot of nutrition variables for the top 10 words",
    xaxis = list(title = "Word"),
    boxmode = "group",
    showlegend = FALSE
  )

Part 7

# 3D scatter of calories, carbohydrates and protein for the items that
# contain a top-10 word, coloured by that word.
scatter_3d <- plot_ly(
  data = filtered_data,
  x = ~Calories,
  y = ~`Carb. (g)`,
  z = ~`Protein (g)`,
  type = "scatter3d",
  mode = "markers",
  color = ~word
)
layout(
  scatter_3d,
  title = "Carbohydrates vs calories vs protein for the top 10 words"
)

Part 8

# Shared geo layout for both choropleths: restrict the map to the USA,
# use the Albers USA projection, and draw lakes in steel blue.
set_map_details <- list(
  scope = "usa",
  projection = list(type = "albers usa"),
  showlakes = TRUE,
  lakecolor = toRGB("steelblue")
)

# NOTE(review): `shadeLimit` is not used anywhere in the visible code;
# kept in case later code relies on it.
shadeLimit <- 125

# Hover label shown for each state on both maps. States without a store
# count show "NA" for the number of stores.
sb_locs_state$hover <- with(
  sb_locs_state,
  paste(
    "Number of Starbucks: ", n_stores, "<br>",
    "State: ", state, "<br>",
    "Population: ", population
  )
)

# Choropleth of the number of stores per state.
map1 <- plot_geo(sb_locs_state, locationmode = "USA-states") |>
  add_trace(
    z = ~n_stores, text = ~hover, locations = ~Abbreviation,
    color = ~n_stores, colors = "Purples"
  ) |>
  layout(geo = set_map_details)

# Choropleth of the population per state.
map2 <- plot_geo(sb_locs_state, locationmode = "USA-states") |>
  add_trace(
    z = ~population, text = ~hover, locations = ~Abbreviation,
    color = ~population, colors = "Purples"
  ) |>
  layout(geo = set_map_details)

# subplot() places its arguments from left to right, so the store map
# (map1) is on the left and the population map (map2) on the right; the
# title is worded to match that order.
subplot(map1, map2) |>
  layout(title = "Starbucks stores (left) and population (right) by state")

The maps show that states with higher populations tend to have more Starbucks stores. For instance, California is the most populous state, and it also has the highest number of Starbucks stores. Likewise, states with low populations, such as Montana and Wyoming, have fewer Starbucks stores.